rm(list=ls())

library(openxlsx)
library(ggplot2)
library(ggthemes)
library(reshape2)
library(readr)
library(tokenizers)
library(dplyr)
library(countrycode)
library(survival)
library(zoo)
library(ggthemes)
library(tidyr)
library(sp)
library(plyr)
library(estprod)
library(stringr)


# Translate country identifiers in `vec` from one coding scheme to another.
#
# Args:
#   vec:      vector of country identifiers expressed in `scheme_1`.
#   scheme_1: name of the `countryref` column holding the codes to translate from.
#   scheme_2: name of the `countryref` column holding the codes to translate to.
#
# Returns the result of recode(): values of `vec` that match a `scheme_1` code
# are replaced by the `scheme_2` code found on the same `countryref` row.
#
# NOTE(review): `countryref` is not defined in this file; presumably it is the
# lookup table shipped with older countrycode releases -- confirm.
recode_countries <- function(vec, scheme_1, scheme_2){
  # Fail early on a misspelled scheme instead of building an empty mapping.
  stopifnot(scheme_1 %in% names(countryref), scheme_2 %in% names(countryref))

  # Only rows flagged as countries take part (rows with a missing type are dropped).
  is_country <- !is.na(countryref$type) & countryref$type == 'country'
  old_codes <- as.character(countryref[[scheme_1]][is_country])
  new_codes <- as.character(countryref[[scheme_2]][is_country])

  # Pair old and new codes row by row. De-duplicating each column on its own
  # shifts the two vectors against each other as soon as either column
  # contains an NA or a repeated value, silently mapping codes to the wrong
  # country. Rows without an old code cannot be matched and are skipped; for a
  # repeated old code the first row wins.
  keep <- !is.na(old_codes) & !duplicated(old_codes)
  recode_vals <- new_codes[keep]
  names(recode_vals) <- old_codes[keep]

  # recode() takes old = new pairs: names are the old codes, values the new ones.
  recode(vec, !!!recode_vals)
}

# Load the relevance-classified border speeches and the full UNGA plenary corpus.
borders_unga_speeches <- read.csv('kw_relevance_predicted_set.csv', stringsAsFactors = FALSE)
full_unga_speeches <- read.csv('UNGAplenary_tiles_v5.csv', stringsAsFactors = FALSE)

# Drop columns whose names start with 'x' (presumably leftover row-index
# columns -- confirm) and keep only the speeches predicted as relevant.
borders_unga_speeches <- borders_unga_speeches %>%
  select(-starts_with('x')) %>%
  filter(Predicted == 1)

# Word count per speech. These two assignments used to be repeated verbatim
# right after each other; the second pass recomputed identical values.
full_unga_speeches$n_words <- count_words(full_unga_speeches$text)
borders_unga_speeches$n_words <- count_words(borders_unga_speeches$text)

######################
# Emotion extraction #
######################
library(tidytext)
library(readtext)
library(dplyr)
library(lsa)
library(spacyr)
library(stringr)

# Lexicon read from anxiety-lexicon.csv; later code uses its `lemma` and
# `anxiety` columns.
rheault_anxiety <- read_csv('anxiety-lexicon.csv')

# Institutional phrases blanked out before tokenizing. Both corpora share this
# whitespace-tolerant pattern: the full corpus used to match only a single
# literal space ('United Nations', 'General Assembly'), so the two corpora
# being compared were not preprocessed the same way.
institution_pattern <- 'United\\s+Nations|Council|General\\s+Assembly|Organization'

# One row per word, border-relevant speeches.
unnested <- borders_unga_speeches %>%
  select(country, year, text, doc_id) %>%
  mutate(text = str_replace_all(text, institution_pattern, ' ')) %>%
  unnest_tokens(word, text)

# One row per word, full corpus.
all_unnested <- full_unga_speeches %>%
  select(country, year, text, doc_id) %>%
  mutate(text = str_replace_all(text, institution_pattern, ' ')) %>%
  unnest_tokens(word, text)

# reimplementing rheault's anxiety extraction approach
# Sum of the cosine similarities between the vector `x` and every row of the
# global `anchor_high` table (columns X2:X301).
#
# Args:
#   x: numeric vector; the callers in this script pass c_across(X2:X301) from
#      inside their own row-wise mutate().
# Returns a single unnamed number.
cosine_high <- function(x){
  # Evaluate the lazy argument right away. Left unevaluated, it would first be
  # touched inside the mutate() below, where c_across() would presumably pick
  # up the anchor row instead of the caller's row.
  force(x)
  anchor_high %>%
    rowwise() %>%
    dplyr::mutate(high_sim = cosine(x, c_across(X2:X301))) %>%
    ungroup() %>%
    # Anchors whose vector columns are NA produce an NA similarity; skip them
    # so that a single missing anchor does not turn the whole sum into NA.
    dplyr::summarise(sum(high_sim, na.rm = TRUE)) %>%
    unlist() %>%
    unname()
}

# Sum of the cosine similarities between the vector `x` and every row of the
# global `anchor_low` table (columns X2:X301).
#
# Args:
#   x: numeric vector; the callers in this script pass c_across(X2:X301) from
#      inside their own row-wise mutate().
# Returns a single unnamed number.
cosine_low <- function(x){
  # Evaluate the lazy argument right away. Left unevaluated, it would first be
  # touched inside the mutate() below, where c_across() would presumably pick
  # up the anchor row instead of the caller's row.
  force(x)
  anchor_low %>%
    rowwise() %>%
    dplyr::mutate(low_sim = cosine(x, c_across(X2:X301))) %>%
    ungroup() %>%
    # Anchors whose vector columns are NA produce an NA similarity; skip them
    # so that a single missing anchor does not turn the whole sum into NA.
    dplyr::summarise(sum(low_sim, na.rm = TRUE)) %>%
    unlist() %>%
    unname()
}

# Word vectors, read without a header row: the first column is renamed to
# `lemma`, the remaining columns keep the default names X2, X3, ...
glove <- read_delim(
  "glove.6B.300d.txt",
  delim = " ",
  col_names = FALSE,
  quote = ""
)
names(glove)[1] <- 'lemma'

# Anchor sets: lexicon rows coded 1 (high) and -1 (low), each joined to its
# vector by lemma. A lemma without a vector keeps its row with NA columns.
anchor_high <- left_join(filter(rheault_anxiety, anxiety == 1), glove, by = 'lemma')

anchor_low <- left_join(filter(rheault_anxiety, anxiety == -1), glove, by = 'lemma')

# Every token of both corpora, grouped by lemma (the frequency filter stays disabled).
tokens <- group_by(
  data.frame(lemma = c(unnested$word, all_unnested$word)),
  lemma
) #%>% filter(n() > 5)
# Vocabulary size, shown when run interactively.
length(unique(tokens$lemma))

# Score every corpus word that has a vector: summed similarity to the high
# anchors minus summed similarity to the low anchors, then rescaled to [-1, 1]
# across the whole vocabulary.
anxiety_scores <- glove %>%
  filter(lemma %in% unique(tokens$lemma)) %>%
  rowwise() %>%
  dplyr::mutate(high = cosine_high(c_across(X2:X301)),
                low = cosine_low(c_across(X2:X301))) %>% #this step can take a few hours to run
  # Leave row-wise mode before rescaling. A bare mutate() here resolves to
  # whichever package was attached last (plyr in this script, which ignores
  # the row-wise grouping); dplyr's mutate() would compute min/max inside each
  # single row and return 0/0. ungroup() plus an explicit dplyr::mutate() keeps
  # the column-wide rescaling independent of the attach order.
  ungroup() %>%
  select(lemma, high, low) %>%
  dplyr::mutate(score = (high - low),
                scaled_score = 2*(score - min(score))/(max(score) - min(score)) - 1)

# Yearly mean of the scaled anxiety score over all scored tokens,
# border-relevant speeches only.
anxiety_counts <- inner_join(unnested, anxiety_scores, by = c(word = 'lemma')) %>%
  group_by(year) %>%
  dplyr::summarise(mean_anxiety = mean(scaled_score))

# Same yearly mean for the full corpus.
all_anxiety_counts <- inner_join(all_unnested, anxiety_scores, by = c(word = 'lemma')) %>%
  group_by(year) %>%
  dplyr::summarise(mean_anxiety = mean(scaled_score))

# Stack both series, labelled by corpus, for plotting.
combined_anxiety <- rbind(
  mutate(anxiety_counts, type = 'borders'),
  mutate(all_anxiety_counts, type = 'all')
)


# Figure 7: smoothed yearly anxiety, full corpus vs. border-relevant speeches.
ggplot(combined_anxiety, aes(x = year, y = mean_anxiety, color = type)) +
  geom_smooth(fill = 'grey85', alpha = 1) +
  labs(
    x = NULL,
    y = 'Anxiety Prevalence',
    title = 'Anxiety, all documents vs. border-relevant (higher = more anxious)'
  ) +
  scale_color_manual(
    labels = c('All Documents', 'Borders Only'),
    name = NULL,
    values = c('blue', 'red')
  ) +
  scale_x_continuous(breaks = c(1970, 1990, 2010)) +
  theme(
    legend.position = 'bottom',
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black")
  )

# Country-year mean anxiety in the border-relevant speeches. The result is only
# displayed; the CSV export that used to follow is disabled.
inner_join(unnested, anxiety_scores, by = c(word = 'lemma')) %>%
  group_by(country, year) %>%
  dplyr::summarise(mean_anxiety = mean(scaled_score))

# Country-year means for the full corpus, with the border-only mean attached
# where the same country-year exists in the border subset (NA otherwise).
anxiety_scores_comparison <- inner_join(all_unnested, anxiety_scores, by = c(word = 'lemma')) %>%
  group_by(country, year) %>%
  dplyr::summarise(mean_anxiety_all = mean(scaled_score)) %>%
  left_join(
    inner_join(unnested, anxiety_scores, by = c(word = 'lemma')) %>%
      group_by(country, year) %>%
      dplyr::summarise(mean_anxiety_borders = mean(scaled_score)),
    by = c('country', 'year')
  )

# Per country: border-minus-overall anxiety summed over years (missing years
# skipped), largest difference first.
anxiety_scores_comparison %>%
  group_by(country) %>%
  dplyr::summarise(anxiety_diff = sum(mean_anxiety_borders - mean_anxiety_all, na.rm = TRUE)) %>%
  arrange(desc(anxiety_diff))

###########################################
# re-implementing, but with anger instead #
###########################################
# Sum of the cosine similarities between the vector `x` and every row of the
# global `anchor_high` table (columns X2:X301). The table is looked up when the
# function is called, so reassigning `anchor_high` changes the anchors used.
#
# Args:
#   x: numeric vector; the callers in this script pass c_across(X2:X301) from
#      inside their own row-wise mutate().
# Returns a single unnamed number.
cosine_high <- function(x){
  # Evaluate the lazy argument right away. Left unevaluated, it would first be
  # touched inside the mutate() below, where c_across() would presumably pick
  # up the anchor row instead of the caller's row.
  force(x)
  anchor_high %>%
    rowwise() %>%
    dplyr::mutate(high_sim = cosine(x, c_across(X2:X301))) %>%
    ungroup() %>%
    # Anchors whose vector columns are NA produce an NA similarity; skip them
    # so that a single missing anchor does not turn the whole sum into NA.
    dplyr::summarise(sum(high_sim, na.rm = TRUE)) %>%
    unlist() %>%
    unname()
}

# Sum of the cosine similarities between the vector `x` and every row of the
# global `anchor_low` table (columns X2:X301). The table is looked up when the
# function is called, so reassigning `anchor_low` changes the anchors used.
#
# Args:
#   x: numeric vector; the callers in this script pass c_across(X2:X301) from
#      inside their own row-wise mutate().
# Returns a single unnamed number.
cosine_low <- function(x){
  # Evaluate the lazy argument right away. Left unevaluated, it would first be
  # touched inside the mutate() below, where c_across() would presumably pick
  # up the anchor row instead of the caller's row.
  force(x)
  anchor_low %>%
    rowwise() %>%
    dplyr::mutate(low_sim = cosine(x, c_across(X2:X301))) %>%
    ungroup() %>%
    # Anchors whose vector columns are NA produce an NA similarity; skip them
    # so that a single missing anchor does not turn the whole sum into NA.
    dplyr::summarise(sum(low_sim, na.rm = TRUE)) %>%
    unlist() %>%
    unname()
}

# Reuse the word vectors if they are already in memory: the file is large and
# nothing in this script modifies `glove` after it is first read, so parsing
# it a second time only costs time.
if (!exists("glove", inherits = FALSE)) {
  glove <- read_delim("glove.6B.300d.txt", 
                      delim = " ", col_names = FALSE, quote="")
}
# Renaming is idempotent, so it is safe to repeat on an already-loaded table.
names(glove)[1] <- 'lemma'

# High ("anger") anchor words, taken from https://www.thesaurus.com/browse/anger,
# joined to their word vectors by lemma.
# NOTE(review): 'emnity' looks like a misspelling of 'enmity' -- confirm before
# changing it, since that alters the anchor set and therefore the scores.
anchor_high <- left_join(
  data.frame(lemma = c(
    'acrimony', 'animosity', 'annoyance', 'antagonism',
    'displeasure', 'emnity', 'exasperation', 'fury', 'hatred',
    'impatience', 'indignation', 'ire', 'irritation', 'outrage',
    'passion', 'rage', 'resentment', 'temper', 'violence', 'anger'
  )),
  glove,
  by = 'lemma'
)

# Low anchor words, joined to their word vectors the same way.
anchor_low <- left_join(
  data.frame(lemma = c(
    'calm', 'cheer', 'comfort', 'delight', 'ease', 'glee', 'happiness',
    'love', 'peace', 'pleasure', 'agreeable', 'contentment', 'enjoyment',
    'pleasant', 'pacify', 'placate'
  )),
  glove,
  by = 'lemma'
)

# Every token of both corpora, grouped by lemma (the frequency filter stays disabled).
tokens <- group_by(
  data.frame(lemma = c(unnested$word, all_unnested$word)),
  lemma
) #%>% filter(n() > 5)
# Vocabulary size, shown when run interactively.
length(unique(tokens$lemma))

# Score every corpus word that has a vector: summed similarity to the high
# (anger) anchors minus summed similarity to the low anchors, then rescaled to
# [-1, 1] across the whole vocabulary.
anger_scores <- glove %>%
  filter(lemma %in% unique(tokens$lemma)) %>%
  rowwise() %>%
  dplyr::mutate(high = cosine_high(c_across(X2:X301)),
                low = cosine_low(c_across(X2:X301))) %>% #this step can take a few hours to run
  # Leave row-wise mode before rescaling. A bare mutate() here resolves to
  # whichever package was attached last (plyr in this script, which ignores
  # the row-wise grouping); dplyr's mutate() would compute min/max inside each
  # single row and return 0/0. ungroup() plus an explicit dplyr::mutate() keeps
  # the column-wide rescaling independent of the attach order.
  ungroup() %>%
  select(lemma, high, low) %>%
  dplyr::mutate(score = (high - low),
                scaled_score = 2*(score - min(score))/(max(score) - min(score)) - 1)

# Yearly mean of the scaled anger score over all scored tokens,
# border-relevant speeches only.
anger_counts <- inner_join(unnested, anger_scores, by = c(word = 'lemma')) %>%
  group_by(year) %>%
  dplyr::summarise(mean_anger = mean(scaled_score))

# Same yearly mean for the full corpus.
all_anger_counts <- inner_join(all_unnested, anger_scores, by = c(word = 'lemma')) %>%
  group_by(year) %>%
  dplyr::summarise(mean_anger = mean(scaled_score))

# Stack both series, labelled by corpus.
combined_anger <- rbind(
  mutate(anger_counts, type = 'borders'),
  mutate(all_anger_counts, type = 'all')
)

# Country-year mean anger in the border-relevant speeches. The result is only
# displayed; the CSV export that used to follow is disabled.
inner_join(unnested, anger_scores, by = c(word = 'lemma')) %>%
  group_by(country, year) %>%
  dplyr::summarise(mean_anger = mean(scaled_score))

# Country-year means for the full corpus, with the border-only mean attached
# where the same country-year exists in the border subset (NA otherwise).
anger_scores_comparison <- inner_join(all_unnested, anger_scores, by = c(word = 'lemma')) %>%
  group_by(country, year) %>%
  dplyr::summarise(mean_anger_all = mean(scaled_score)) %>%
  left_join(
    inner_join(unnested, anger_scores, by = c(word = 'lemma')) %>%
      group_by(country, year) %>%
      dplyr::summarise(mean_anger_borders = mean(scaled_score)),
    by = c('country', 'year')
  )

# Per country: border-minus-overall anger summed over years (missing years
# skipped), largest difference first.
anger_scores_comparison %>%
  group_by(country) %>%
  dplyr::summarise(anger_diff = sum(mean_anger_borders - mean_anger_all, na.rm = TRUE)) %>%
  arrange(desc(anger_diff))

# Figure F1: border-speech anger and anxiety by country-year.
# Put both emotion comparisons side by side, keyed on country and year.
combined_emotions <- left_join(
  anger_scores_comparison,
  anxiety_scores_comparison,
  by = c('country', 'year')
)

# Keep only the border-specific means, reshape to one row per emotion, and plot
# the points with a smoother per emotion.
combined_emotions %>%
  select(-c(mean_anger_all, mean_anxiety_all)) %>%
  pivot_longer(cols = starts_with("mean")) %>%
  mutate(name = recode_factor(name,
                              mean_anger_borders = 'Anger',
                              mean_anxiety_borders = 'Anxiety')) %>%
  ggplot(aes(x = year, y = value, color = name)) +
  geom_point() +
  geom_smooth() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    legend.position = 'bottom'
  ) +
  labs(x = NULL, y = "Emotion Score") +
  scale_color_discrete(name = 'Emotion')

###########################################################
# Generating a null distribution by sampling random words #
###########################################################
library(quanteda)
set.seed(11345)
# Draw (up to) 1000 distinct words from the corpus vocabulary. The sampled
# positions index the de-duplicated vocabulary; they used to be applied to the
# raw token stream, which could only ever reach its first
# length(vocabulary) tokens and so was not a random draw of words.
null_vocab <- unique(tokens$lemma)
null_word_idx <- sample(seq_along(null_vocab), min(1000, length(null_vocab)))
null_words <- null_vocab[null_word_idx]

# Escape regex metacharacters so that tokens containing e.g. '.' match literally.
null_words <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", null_words)

# Whole-word alternation. Word boundaries replace the former '\\W' delimiters,
# which consumed the neighbouring characters and therefore missed terms at the
# very start or end of a text and terms directly following another match.
terms <- paste0('\\b(?:', paste(null_words, collapse = '|'), ')\\b')

# Per-speech counts of random-term hits and of border-related stems.
full_unga_speeches[,'null_word'] <- str_count(full_unga_speeches$text, pattern = regex(terms, ignore_case=TRUE)) #this line can take a while to run
full_unga_speeches[,'border_words'] <- str_count(full_unga_speeches$text, pattern = regex('border|boundar|delimit|demarcat|transborder|transboundar', ignore_case=TRUE))
full_unga_speeches[,'length'] <- ntoken(full_unga_speeches$text)

# Yearly average of hits per token: sum(x / n()) is the per-year mean of x.
term_data_condensed <- full_unga_speeches %>% group_by(year) %>% 
  dplyr::summarise(null_word_normalized = sum(null_word/length/n()),
                   border_normalized =  sum(border_words/length/n()))

# Long format with readable facet labels.
to_plot <- melt(term_data_condensed, id.vars='year') %>%
  mutate(variable = recode(variable, null_word_normalized = "Random terms (normalized)",
                           border_normalized = "Border terms (normalized)"))

# Figure B1: yearly normalized frequency of random vs. border terms, one facet
# each with free scales. The plot object is stored in `gg`, not displayed here.
gg <- ggplot(to_plot, aes(x = year, y = value, color = variable)) +
  geom_smooth() +
  geom_point() +
  facet_wrap(~variable, nrow = 2, scales = 'free') +
  theme(
    legend.position = 'none',
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black")
  ) +
  labs(x = NULL, y = NULL)
